In [36]:
import requests
from bs4 import BeautifulSoup
import polars as pl
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'vscode'
pl.Config(tbl_cols=-1)
Out[36]:
<polars.config.Config at 0x1173501d0>
In [2]:
# every page of the search results:
# CB1 within 3 miles, up to GBP700k
# the result index advances 24 listings per page, hence the step of 24
urls = [
    f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier=OUTCODE%5E409&radius=3.0&_includeSSTC=on&index={i}&sortType=2&channel=BUY&transactionType=BUY&displayLocationIdentifier=CB1.html&maxPrice=700000#prop165096422"
    for i in range(0, 1200, 24)
]
In [3]:
relative_urls = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # get the wrapper for all the properties on each page
    all_containers = soup.find_all(class_='PropertyCard_propertyCardDescriptionInner__3Vkmk')
    # loop through each property card, get its href,
    # and append the relative URL to scrape later
    for container in all_containers:
        relative_urls.append(container.find(href=True).get('href'))
In [4]:
# check how many relative URLs were scraped
len(relative_urls)
Out[4]:
1050
In [5]:
# build the full URLs from the base URL
# e.g. a full property URL is https://www.rightmove.co.uk/properties/164828633#/?channel=RES_BUY
# the base in this case is https://www.rightmove.co.uk
# and the relative part is /properties/166143110#/?channel=RES_BUY
# create the full list
full_urls = [f"https://www.rightmove.co.uk{relative}" for relative in relative_urls]
In [6]:
# click one of the full URLs to check that the links work
full_urls[0:5]
Out[6]:
['https://www.rightmove.co.uk/properties/87242181#/?channel=RES_BUY', 'https://www.rightmove.co.uk/properties/165096422#/?channel=RES_BUY', 'https://www.rightmove.co.uk/properties/161890835#/?channel=RES_BUY', 'https://www.rightmove.co.uk/properties/166349258#/?channel=RES_BUY', 'https://www.rightmove.co.uk/properties/166452245#/?channel=RES_BUY']
Class Testing
- ensure the CSS class selectors are finding the correct information and values on a single property page
In [7]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36"
}
test_request = requests.get(
    'https://www.rightmove.co.uk/properties/161836745#/?channel=RES_BUY',
    # full_urls[5],
    headers=headers
)
In [8]:
test_request.status_code  # HTTP 410 ('Gone') indicates this particular listing has since been removed
Out[8]:
410
In [9]:
test_soup = BeautifulSoup(test_request.content, 'html.parser')
In [10]:
container = test_soup.find_all(class_='_9u6R9n55iQlZi-JF6H59W')
more_info_array = [cont.find(class_='_2zXKe70Gdypr_v9MUDoVCm').text for cont in container]
more_info_template = ['council_tax', 'parking', 'garden', 'accessibility']
dict(zip(more_info_template, more_info_array))
Out[10]:
{'council_tax': 'Band: D',
'parking': 'Off street',
'garden': 'Yes',
'accessibility': 'Ask agent'}
In [11]:
# price
test_soup.find(class_='_1gfnqJ3Vtd1z40MlC0MzXu').find('span').text
Out[11]:
'£700,000'
In [12]:
# street address
test_soup.find(class_='_2uQQ3SV0eMHL1P6t5ZDo2q').text
Out[12]:
'Kelvin Close'
In [13]:
property_container = test_soup.find_all(class_='_3gIoc-NFXILAOZEaEjJi1n')
property_info = [cont.find(class_='_1hV1kqpVceE9m-QrX_hWDN').text for cont in property_container]
property_template = ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure']
dict(zip(property_template, property_info))
Out[13]:
{'property_type': 'Semi-Detached',
'bedrooms': '3',
'bathrooms': '2',
'size': 'Ask agent',
'tenure': 'Freehold'}
In [14]:
# added date
test_soup.find(class_='_2nk2x6QhNB1UrxdI5KpvaF').text
Out[14]:
'Added on 12/05/2025'
Scraping
In [15]:
# create the full list
full_urls = [f"https://www.rightmove.co.uk{relative}" for relative in relative_urls]

rows = []
for i, url in enumerate(full_urls):
    # print(f"Processing URL {i+1}/{len(full_urls)}: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/127.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Helper functions to safely extract text
        def safe_extract(soup, class_name, default="Not found"):
            element = soup.find(class_=class_name)
            return element.text.strip() if element else default

        def safe_extract_nested(soup, class_name, nested_tag, default="Not found"):
            element = soup.find(class_=class_name)
            if element:
                nested = element.find(nested_tag)
                return nested.text.strip() if nested else default
            return default

        # Extract basic information with error handling
        address = safe_extract(soup, '_2uQQ3SV0eMHL1P6t5ZDo2q')
        added_date = safe_extract(soup, '_2nk2x6QhNB1UrxdI5KpvaF')
        price = safe_extract_nested(soup, '_1gfnqJ3Vtd1z40MlC0MzXu', 'span')

        # council tax, parking, garden, accessibility
        misc_container = soup.find_all(class_='_9u6R9n55iQlZi-JF6H59W')
        misc_values = []
        for cont in misc_container:
            value_elem = cont.find(class_='_2zXKe70Gdypr_v9MUDoVCm')
            misc_values.append(value_elem.text.strip() if value_elem else "Not found")
        misc_keys = ['council_tax', 'parking', 'garden', 'accessibility']
        # Pad with "Not found" if we have fewer values than keys
        # this error handling was suggested by Claude AI
        while len(misc_values) < len(misc_keys):
            misc_values.append("Not found")
        misc_info_dict = dict(zip(misc_keys, misc_values))

        # property type, bedrooms, bathrooms, size, tenure
        property_container = soup.find_all(class_='_3gIoc-NFXILAOZEaEjJi1n')
        property_info = []
        for cont in property_container:
            value_elem = cont.find(class_='_1hV1kqpVceE9m-QrX_hWDN')
            property_info.append(value_elem.text.strip() if value_elem else "Not found")
        property_keys = ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure']
        # Pad with "Not found" if we have fewer values than keys
        while len(property_info) < len(property_keys):
            property_info.append("Not found")
        property_info_dict = dict(zip(property_keys, property_info))

        # create the row dictionary with all information
        row = {
            'url': url,
            'address': address,
            'added_date': added_date,
            'price': price
        }
        # add the property and misc info to the row
        row.update(property_info_dict)
        row.update(misc_info_dict)
        # append the complete row to rows list
        rows.append(row)
        # print(f"Successfully processed: {address}")
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        continue
    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue

# create DataFrame
df = pl.DataFrame(rows)
In [16]:
# inspect the dataframe
df.head()
Out[16]:
shape: (5, 13)
| url | address | added_date | price | property_type | bedrooms | bathrooms | size | tenure | council_tax | parking | garden | accessibility |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | str | str | str | str | str |
| "https://www.rightmove.co.uk/pr… | "Beech Close, Little Shelford, … | "Reduced on 29/08/2025" | "£600,000" | "Detached Bungalow" | "3" | "1" | "900 sq ft" | "Freehold" | "Band: D" | "Garage,Driveway" | "Yes" | "Ask agent" |
| "https://www.rightmove.co.uk/pr… | "Manor Park, Histon" | "Added on 29/07/2025" | "£700,000" | "Detached" | "3" | "1" | "1,313 sq ft" | "Freehold" | "Band: TBC" | "Driveway,Off street" | "Private garden" | "Ask agent" |
| "https://www.rightmove.co.uk/pr… | "Turvill Place, Cambridge, CB4" | "Reduced on 02/09/2025" | "£700,000" | "Semi-Detached" | "4" | "2" | "1,521 sq ft" | "Freehold" | "Band: E" | "Yes" | "Yes" | "Ask agent" |
| "https://www.rightmove.co.uk/pr… | "Cambridge, Cambridgeshire" | "Added on 29/08/2025" | "£700,000" | "Detached" | "4" | "2" | "1,722 sq ft" | "Freehold" | "Band: F" | "Garage,Allocated" | "Yes" | "Ask agent" |
| "https://www.rightmove.co.uk/pr… | "Sedgwick Street, Cambridge" | "Added on 01/09/2025" | "£700,000" | "Terraced" | "4" | "2" | "Ask agent" | "Freehold" | "Band: C" | "On street" | "Yes" | "Ask agent" |
Data Cleaning
In [17]:
clean_df = (
    df
    .with_columns(
        pl.col('price').str.replace_all(r'£|,', '').cast(pl.Float64),
        # a few listings leak size text or 'Ask agent' into the bedrooms field; strip it before casting
        pl.col('bedrooms').str.replace_all(r"1,835 sq ft|2,475 sq ft|Ask agent", '').cast(pl.Categorical, strict=False),
        pl.col('bathrooms').cast(pl.Categorical, strict=False),
        pl.col('size').str.replace_all(r' sq ft|,', '').cast(pl.Float64, strict=False),
        pl.col('council_tax').str.replace_all(r'Band: ', ''),
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]+?)').alias('zip_code'),
        pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2})').alias('zip_code'),  # outward postcode: two letters followed by one or two digits, e.g. CB4
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2}.{4})').alias('zip_code'),
        # note: only 'Added on' is stripped, so 'Reduced on ...' dates parse to null
        pl.col('added_date').str.replace('Added on ', '').str.to_datetime(format='%d/%m/%Y', strict=False).dt.date(),
    )
    .with_columns(
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedroom'),
        ((pl.col('bedrooms').cast(pl.Int64, strict=False) + pl.col('bathrooms').cast(pl.Int64, strict=False)) / pl.col('size')).alias('bed_bath_density')
    )
    .rename({'zip_code': 'postcode'})
)
clean_df.head()
Out[17]:
shape: (5, 16)
| url | address | added_date | price | property_type | bedrooms | bathrooms | size | tenure | council_tax | parking | garden | accessibility | postcode | sqft_per_bedroom | bed_bath_density |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | date | f64 | str | cat | cat | f64 | str | str | str | str | str | str | f64 | f64 |
| "https://www.rightmove.co.uk/pr… | "Beech Close, Little Shelford, … | null | 600000.0 | "Detached Bungalow" | "3" | "1" | 900.0 | "Freehold" | "D" | "Garage,Driveway" | "Yes" | "Ask agent" | null | 300.0 | 0.004444 |
| "https://www.rightmove.co.uk/pr… | "Manor Park, Histon" | 2025-07-29 | 700000.0 | "Detached" | "3" | "1" | 1313.0 | "Freehold" | "TBC" | "Driveway,Off street" | "Private garden" | "Ask agent" | null | 437.666667 | 0.003046 |
| "https://www.rightmove.co.uk/pr… | "Turvill Place, Cambridge, CB4" | null | 700000.0 | "Semi-Detached" | "4" | "2" | 1521.0 | "Freehold" | "E" | "Yes" | "Yes" | "Ask agent" | "CB4" | 380.25 | 0.003945 |
| "https://www.rightmove.co.uk/pr… | "Cambridge, Cambridgeshire" | 2025-08-29 | 700000.0 | "Detached" | "4" | "2" | 1722.0 | "Freehold" | "F" | "Garage,Allocated" | "Yes" | "Ask agent" | null | 430.5 | 0.003484 |
| "https://www.rightmove.co.uk/pr… | "Sedgwick Street, Cambridge" | 2025-09-01 | 700000.0 | "Terraced" | "4" | "2" | null | "Freehold" | "C" | "On street" | "Yes" | "Ask agent" | null | null | null |
In [18]:
# check all the garden values
pl.Config(tbl_rows=-1)
df['garden'].value_counts()
Out[18]:
shape: (20, 2)
| garden | count |
|---|---|
| str | u32 |
| "Private garden,Back garden" | 2 |
| "Patio,Private garden,Enclosed … | 12 |
| "Yes" | 577 |
| "Patio" | 11 |
| "Communal garden,Terrace" | 2 |
| "Front garden,Rear garden" | 4 |
| "Front garden" | 1 |
| "Front garden,Back garden" | 8 |
| "Communal garden" | 23 |
| "Private garden,Patio,Enclosed … | 8 |
| "Rear garden" | 8 |
| "Private garden" | 75 |
| "Private garden,Patio" | 2 |
| "Patio,Private garden" | 1 |
| "Ask developer" | 31 |
| "Terrace" | 3 |
| "On street" | 1 |
| "Back garden" | 8 |
| "Ask agent" | 269 |
| "Not found" | 4 |
In [19]:
# this is to export the data
# !pip install xlsxwriter
# clean_df.write_excel('right_move_CB1_3miles_700kGBP.xlsx')
In [20]:
# for google colab renderer
# !pip install -U kaleido
In [32]:
# explore the data and check the relationship between size (sq ft) and price
# also annotate properties with a private garden ('pg')
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    color='bedrooms',
    labels={'size': 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')
)
fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)
fig.show(renderer='notebook')
In [35]:
# the majority of the 1- and 2-bedroom properties do not have a garden - check whether they are apartments
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    color=np.where(clean_df['property_type'].str.to_lowercase().str.contains('apartment').to_numpy(), 'apartment', 'rest'),
    labels={'size': 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')
)
fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)
fig.show(renderer='png')
In [23]:
# engineer one new feature: square footage per bedroom
clean_df_2 = (
    clean_df
    .with_columns(
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedrooms')
    )
)
Interpretation:
- Average room space allocation:
  - higher values = more spacious properties relative to bedroom count
  - lower values = more compact/efficient use of space
- Property layout insights:
  - for example, a 2,000 sq ft house with 2 bedrooms = 1,000 sq ft per bedroom (more spacious)
- Value comparison:
  - luxury properties will have higher sqft-per-bedroom ratios
  - starter homes and apartments will have lower ratios
How does this metric work in practice? (a short sketch follows this list)
- filter out properties with the least sqft per bedroom
- compare properties with different bedroom counts
- analyze pricing patterns based on space efficiency and spot outliers (cramped vs spacious)
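To make this concrete, here is a minimal sketch, not part of the original analysis, that ranks the 2-3 bedroom properties by space per bedroom using the clean_df_2 frame built above (the column selection is purely illustrative):

# minimal sketch: rank 2-3 bedroom properties by space per bedroom, most spacious first
(
    clean_df_2
    .with_columns(pl.col('bedrooms').cast(pl.Int64, strict=False))
    .filter(pl.col('bedrooms').is_between(2, 3))
    .sort('sqft_per_bedrooms', descending=True, nulls_last=True)
    .select(['address', 'price', 'bedrooms', 'size', 'sqft_per_bedrooms'])
    .head(10)
)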
In [47]:
# interested in 2-3 bedroom properties
# want a relatively spacious layout (nothing that feels cramped inside)
# i.e. large total size (sqft), higher sqft_per_bedroom, and ideally 3 bedrooms
# the continuous colour scale runs from light pink to blue
fig = px.scatter(
    data_frame=(
        clean_df_2
        .with_columns(
            pl.col('bedrooms').cast(pl.Int64, strict=False)
        )
        .filter(
            pl.col('bedrooms').is_between(2, 3)
        )
    ),
    x='sqft_per_bedrooms',
    y='price',
    color='size',
    # trendline='ols',
    color_continuous_scale='RdBu',
    text='bedrooms'
)
fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.9),
    textposition='top center',
    textfont=dict(size=10, color='black')
)
fig.show(renderer='jpg')
In [49]:
# the ideal, more balanced properties seem to be:
# - over 1,000 sqft in total
# - over 500 sqft per bedroom (read off the chart above; not filtered explicitly below)
# - 2-3 bedrooms
# - between £350k and £550k
shortlisted_properties = (
    clean_df_2
    .with_columns(
        pl.col('bedrooms').cast(pl.Int64, strict=False)
    )
    .filter(
        pl.col('size').gt(1000) &
        pl.col('bedrooms').is_between(2, 3) &
        pl.col('price').is_between(350000, 550000)
    )
)
In [70]:
# bar chart of the shortlisted properties by asking price
fig = px.bar(
    shortlisted_properties,
    y='address',
    x='price',
)
fig.update_layout(
    template='ggplot2',
    width=1000,
)
# render as a static image; drop renderer='png' for an interactive HTML figure
fig.show(renderer='png')
In [73]:
# shortlisted_properties.write_excel('shortlisted_properties.xlsx')
In [48]:
# out of curiosity, check property prices by postcode
# faceted by exactly 2 or 3 bedrooms vs the rest
fig = px.box(
    data_frame=clean_df_2,
    x='postcode',
    y='price',
    points='all',
    facet_col=np.where(
        clean_df_2['bedrooms'].cast(pl.Int64, strict=False).is_between(2, 3).to_numpy(),
        '2 or 3 bedrooms', 'rest'
    ),
    color='bedrooms',
    title='Property Prices by Postcode: 2-3 Bedrooms vs Others'
)
fig.update_layout(
    template='ggplot2',
    width=1200,
    yaxis_tickformat='£,.0f',
    xaxis_title='Postcode',
    yaxis_title='Price (£)'
)
# rotate postcode labels if they're crowded
fig.update_xaxes(tickangle=45)
fig.show(renderer='jpg')